library(tidyverse)
library(tidytext)
library(arrow)
library(quanteda)
library(quanteda.textstats)
library(NetworkInference)


# Bigram / trigram frequency scan over the filtered NewsWhip text ----

# Load the pre-filtered table from its parquet file.
filtered_newswhip <- read_parquet(
  file = "data/processed/filtered_newswhip.parquet"
)

# One corpus document per element of the `headline_and_blurb` column.
corpus <- filtered_newswhip$headline_and_blurb |>
  quanteda::corpus()

# Word-level tokenisation with punctuation stripped, then the English
# "smart" stopword list is removed.
# NOTE(review): stopwords are removed without `padding = TRUE`, so the
# n-grams built below can join words that were separated by a stopword in
# the original text -- confirm this is intended.
tokens <- tokens_remove(
  quanteda::tokens(corpus, what = "word", remove_punct = TRUE),
  pattern = stopwords::stopwords(language = "en", source = "smart")
)

# Build bigrams and trigrams, then a document-feature matrix over them.
ngrams <- tokens |>
  tokens_ngrams(n = c(2L, 3L))
ngrams_dfm <- ngrams |>
  dfm()

# Frequency table of the n-gram features; keep rows whose `frequency`
# is at least 50.
ngram_freq <- ngrams_dfm |>
  textstat_frequency()
common_ngrams <- ngram_freq[which(ngram_freq$frequency >= 50), ]

print(common_ngrams)

